weibull = fitdist(dat1$`Day Ahead Price (HB_North)`, "weibull", method="mle")
par(mfrow=c(2,2))
plot.legend <- c("lognorm", "weibull", "gamma", "norm")
denscomp(list(lndist, weibull), legendtext = plot.legend)
cdfcomp (list(lndist, weibull, gamma, normf), legendtext = plot.legend)
qqcomp  (list(lndist, weibull, gamma, normf), legendtext = plot.legend)
ppcomp  (list(lndist, weibull, gamma, normf), legendtext = plot.legend)
AkaikeInfoCrit_ = round(unlist(lapply(list(lndist,  gamma, weibull, normf), function(x) x["aic"])))
names(AkaikeInfoCrit_) =  c("lognorm", "gamma", "weibull", "norm")
fitstats = data.frame(AkaikeInfoCrit_)
fitstats$BayesianInfoCrit_ = round(unlist(lapply(list(lndist, gamma, weibull, normf), function(x) x["bic"])))
fitstats$LogLikelihood_ = round(unlist(lapply(list(lndist, gamma, weibull, normf), function(x) x["loglik"])))
kable(fitstats, caption="Table of Fits for candidate distributions", align=c("l", "l", "c", "r"))
get_probs = function(vals, biggerthan, smallerthan){
numer = sum(vals > biggerthan & vals < smallerthan)
denom = length(vals)
est = (numer/denom)*100
return(round(est, 2))
}
prob = get_probs(dat1$`Day Ahead Price (HB_North)`, 55, 60)
#prob_55_60 = sum(dat1$`Day Ahead Price (HB_North)` > 55 & dat1$`Day Ahead Price (HB_North)` < 60)/length(dat1$`Day Ahead Price (HB_North)`)
date_frequency <- "1 hour"  # Time step frequency.
# Workbook 2 - this is a more dense time series at the five minute interval
dat2 = read_excel("~/Downloads/Workbook 2_Skills Test.xlsx")
# Thicken this dataset so that it contains the correct outcome - max(price) hourly
dat2_test = dat2 %>%
#filter(`Date/Time` > as.POSIXct("2016-05-15") & `Date/Time` < as.POSIXct("2016-08-16")) %>%
rename(date = `Date/Time`) %>%
rename(rtprice = `Real Time Price (HB_North)` ) %>%
thicken(colname="datehour", interval="hour", by="date", rounding = "down") %>%
group_by(datehour) %>%
summarise(rtprice = max(rtprice)) %>%
fill_gaps(date_col=1, frequency = date_frequency) # fill gaps with NA
dat1_test = dat1 %>%
rename(datehour = date) #%>%
#filter(datehour > as.POSIXct("2016-05-15") & datehour < as.POSIXct("2016-08-16"))
# do the join on date-hour, and be aware that dat2 starts two weeks earlier
dat = dat2_test %>% left_join(dat1_test)
# modify names to avoid issues
names(dat) = gsub("[.]| |\\(|\\)", "_", names(dat))
# FEATURE ENGINEERING
dat_features <- forecastML::create_lagged_df(dat %>%
dplyr::select(-datehour),
outcome_col = 1,
type = "train",
horizons = 1,
lookback = c(1,2,3,4),
frequency = date_frequency,
keep_rows = T)$horizon_1
dat_features %>%
mutate(datehour = dat$datehour)
mutate(target = ifelse(rtprice > 150, "Outlier", "Inlier"))
dat_features = dat_features %>%
mutate(datehour = dat$datehour) %>%
mutate(target = ifelse(rtprice > 150, "Outlier", "Inlier")) %>%
dplyr::select(-rtprice, -datehour)
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(tidyr)
library(readxl)
library(fitdistrplus)
library(stats4)
library(MASS)
# for other necessary test or graphical tools
library(survival)
library(distrMod)
library(lubridate)
library(kableExtra)
library(ggplot2)
library(caret)
library(xgboost)
library(verification)
library(timetk)
library(dplyr)
library(forecastML)
library(padr)
library(reshape2)
dat1 = read_excel("~/Downloads/Workbook 1_Skills Test.xlsx")
ggplot(dat1, aes(x = `Day Ahead Price (HB_North)`)) +
geom_histogram(aes(y = ..density..),
colour = 1, fill = "white", bins=100) +
geom_density(col="red") + scale_x_log10() +ggtitle("Day Ahead Price HB North")
summary(dat1$`Day Ahead Price (HB_North)`)
params = descdist(log(dat1$`Day Ahead Price (HB_North)`), boot=500)
normf = fitdist(dat1$`Day Ahead Price (HB_North)`, "norm", method="mle")
lndist = fitdist(dat1$`Day Ahead Price (HB_North)`, "lnorm", method="mle")
gamma = fitdist(dat1$`Day Ahead Price (HB_North)`, "gamma", method="mle")
weibull = fitdist(dat1$`Day Ahead Price (HB_North)`, "weibull", method="mle")
par(mfrow=c(2,2))
plot.legend <- c("lognorm", "weibull", "gamma", "norm")
denscomp(list(lndist, weibull), legendtext = plot.legend)
cdfcomp (list(lndist, weibull, gamma, normf), legendtext = plot.legend)
qqcomp  (list(lndist, weibull, gamma, normf), legendtext = plot.legend)
ppcomp  (list(lndist, weibull, gamma, normf), legendtext = plot.legend)
AkaikeInfoCrit_ = round(unlist(lapply(list(lndist,  gamma, weibull, normf), function(x) x["aic"])))
names(AkaikeInfoCrit_) =  c("lognorm", "gamma", "weibull", "norm")
fitstats = data.frame(AkaikeInfoCrit_)
fitstats$BayesianInfoCrit_ = round(unlist(lapply(list(lndist, gamma, weibull, normf), function(x) x["bic"])))
fitstats$LogLikelihood_ = round(unlist(lapply(list(lndist, gamma, weibull, normf), function(x) x["loglik"])))
kable(fitstats, caption="Table of Fits for candidate distributions", align=c("l", "l", "c", "r"))
get_probs = function(vals, biggerthan, smallerthan){
numer = sum(vals > biggerthan & vals < smallerthan)
denom = length(vals)
est = (numer/denom)*100
return(round(est, 2))
}
prob = get_probs(dat1$`Day Ahead Price (HB_North)`, 55, 60)
#prob_55_60 = sum(dat1$`Day Ahead Price (HB_North)` > 55 & dat1$`Day Ahead Price (HB_North)` < 60)/length(dat1$`Day Ahead Price (HB_North)`)
date_frequency <- "1 hour"  # Time step frequency.
# Workbook 2 - this is a more dense time series at the five minute interval
dat2 = read_excel("~/Downloads/Workbook 2_Skills Test.xlsx")
# Thicken this dataset so that it contains the correct outcome - max(price) hourly
dat2_test = dat2 %>%
#filter(`Date/Time` > as.POSIXct("2016-05-15") & `Date/Time` < as.POSIXct("2016-08-16")) %>%
rename(date = `Date/Time`) %>%
rename(rtprice = `Real Time Price (HB_North)` ) %>%
thicken(colname="datehour", interval="hour", by="date", rounding = "down") %>%
group_by(datehour) %>%
summarise(rtprice = max(rtprice)) %>%
fill_gaps(date_col=1, frequency = date_frequency) # fill gaps with NA
dat1_test = dat1 %>%
rename(datehour = date) #%>%
#filter(datehour > as.POSIXct("2016-05-15") & datehour < as.POSIXct("2016-08-16"))
# do the join on date-hour, and be aware that dat2 starts two weeks earlier
dat = dat2_test %>% left_join(dat1_test)
# modify names to avoid issues
names(dat) = gsub("[.]| |\\(|\\)", "_", names(dat))
# FEATURE ENGINEERING
dat_features <- forecastML::create_lagged_df(dat %>%
dplyr::select(-datehour),
outcome_col = 1,
type = "train",
horizons = 1,
lookback = c(1,2,3,4),
frequency = date_frequency,
keep_rows = T)$horizon_1
dat_features = dat_features %>%
mutate(datehour = dat$datehour) %>%
mutate(target = ifelse(rtprice > 150, "Outlier", "Inlier")) %>%
dplyr::select(-rtprice)
dat_train = dat_features %>%
filter(datehour < as.POSIXct("2020-01-01")) %>%
dplyr::select(-datehour)
dat_train = dat_features %>%
filter(datehour < as.POSIXct("2020-01-01")) %>%
dplyr::select(-datehour)
?train
tr.stru <- train(target~., data=dat_train,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
summary(dat_train)
dat_train = dat_features %>%
filter(datehour < as.POSIXct("2020-01-01")) %>%
dplyr::select(-datehour) %>%
drop_na()
dat_train = dat_features %>%
filter(datehour < as.POSIXct("2020-01-01")) %>%
dplyr::select(-datehour)%>%
drop_na()
tr.stru <- train(target~., data=dat_train,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
summary(tr.stru)
tr.stru
xgb <- train(target~., data=dat_train,
method = "xgbTree",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
library(caret)
library(mlbench)
install.packages("mlbench")
library(caret)
library(mlbench)
data(Sonar)
cv <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = prSummary,
seeds = set.seed(123))
turn_grid_xgb <- expand.grid(
eta = 0.1,
max_depth = 5,
min_child_weight = 1,
subsample = 0.8,
colsample_bytree = 0.8,
nrounds = c(1,5)*200,
gamma = 0)
set.seed(123)
xgb_1 <- train(Class~., data = Sonar,
method = "xgbTree",
tuneGrid = turn_grid_xgb,
trControl = cv,
verbose = FALSE,
metric = "F",
verbosity = 0)
install.packages("MLmetrics")
library(caret)
library(mlbench)
data(Sonar)
cv <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = prSummary,
seeds = set.seed(123))
turn_grid_xgb <- expand.grid(
eta = 0.1,
max_depth = 5,
min_child_weight = 1,
subsample = 0.8,
colsample_bytree = 0.8,
nrounds = c(1,5)*200,
gamma = 0)
set.seed(123)
xgb_1 <- train(Class~., data = Sonar,
method = "xgbTree",
tuneGrid = turn_grid_xgb,
trControl = cv,
verbose = FALSE,
metric = "F",
verbosity = 0)
cv <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = prSummary,
seeds = set.seed(123))
turn_grid_xgb <- expand.grid(
eta = 0.1,
max_depth = 5,
min_child_weight = 1,
subsample = 0.8,
colsample_bytree = 0.8,
nrounds = c(1,5)*200,
gamma = 0)
xgb <- train(target~., data=dat_train,
method = "xgbTree",
#preProcess = c("center", "scale"),
tuneGrid = turn_grid_xgb,
trControl = cv, verbosity=0)
postResample( predict(xgb, newdata=dat_test)) # the out-sample error
postResample( predict(xgb, newdata=dat_train)) # the out-sample error
xgb
names(dat_test)
dat_test = dat_features %>%
filter(datehour < as.POSIXct("2020-01-01")) %>%
dplyr::select(-datehour)%>%
drop_na()
postResample( predict(xgb, newdata=dat_train)) # the out-sample error
?postResample
postResample( predict(xgb, newdata=dat_test), dat_test$target) # the out-sample error
predict(xgb, newdata=dat_test)
postResample( predict(xgb, newdata=dat_test), dat_test$target) # the out-sample error
preds =  predict(xgb, newdata=dat_test)
length(preds)
length(dat_test$target)
preds =  predict(xgb, newdata=dat_test)
F1_Score(dat_test$target, preds, positive = NULL)
library(MLmetrics)
preds =  predict(xgb, newdata=dat_test)
F1_Score(dat_test$target, preds, positive = NULL)
preds_in_sample =  predict(xgb, newdata=dat_train)
preds_out_of_sample =  predict(xgb, newdata=dat_test)
out = data.frame(F1 = c(F1_Score(dat_train$target, preds_in_sample, positive = NULL),
F1_Score(dat_test$target, preds_out_of_sample, positive = NULL),
Data = c("In-Sample", "Out-Sample")
out
out = data.frame(F1 = c(F1_Score(dat_train$target, preds_in_sample, positive = NULL),
F1_Score(dat_test$target, preds_out_of_sample, positive = NULL)),
Data = c("In-Sample", "Out-Sample"))
out
plot(xtb)
plot(xgb)
auc(xgb)
AUC(xgb)
ROC(xgb)
install.packages("pROC")
install.packages("pROC")
install.packages("pROC")
install.packages("pROC")
install.packages("pROC")
library(pROC)
plot.roc(preds_in_sample, dat_train$target)
caret::plot.roc(preds_in_sample, dat_train$target)
pROC::plot.roc(preds_in_sample, dat_train$target)
predict_proba(xgb, newdata=dat_train)
preds_in_sample
dat_train
predict(xgb, dat_features)
names(dat_features)
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(tidyr)
library(readxl)
library(fitdistrplus)
library(stats4)
library(MASS)
# for other necessary test or graphical tools
library(survival)
library(distrMod)
library(lubridate)
library(kableExtra)
library(ggplot2)
library(caret)
library(xgboost)
library(verification)
library(timetk)
library(dplyr)
library(forecastML)
library(padr)
library(reshape2)
library(MLmetrics)
dat1 = read_excel("~/Downloads/Workbook 1_Skills Test.xlsx")
ggplot(dat1, aes(x = `Day Ahead Price (HB_North)`)) +
geom_histogram(aes(y = ..density..),
colour = 1, fill = "white", bins=100) +
geom_density(col="red") + scale_x_log10() +ggtitle("Day Ahead Price HB North")
summary(dat1$`Day Ahead Price (HB_North)`)
params = descdist(log(dat1$`Day Ahead Price (HB_North)`), boot=500)
normf = fitdist(dat1$`Day Ahead Price (HB_North)`, "norm", method="mle")
lndist = fitdist(dat1$`Day Ahead Price (HB_North)`, "lnorm", method="mle")
gamma = fitdist(dat1$`Day Ahead Price (HB_North)`, "gamma", method="mle")
weibull = fitdist(dat1$`Day Ahead Price (HB_North)`, "weibull", method="mle")
par(mfrow=c(2,2))
plot.legend <- c("lognorm", "weibull", "gamma", "norm")
denscomp(list(lndist, weibull), legendtext = plot.legend)
cdfcomp (list(lndist, weibull, gamma, normf), legendtext = plot.legend)
qqcomp  (list(lndist, weibull, gamma, normf), legendtext = plot.legend)
ppcomp  (list(lndist, weibull, gamma, normf), legendtext = plot.legend)
AkaikeInfoCrit_ = round(unlist(lapply(list(lndist,  gamma, weibull, normf), function(x) x["aic"])))
names(AkaikeInfoCrit_) =  c("lognorm", "gamma", "weibull", "norm")
fitstats = data.frame(AkaikeInfoCrit_)
fitstats$BayesianInfoCrit_ = round(unlist(lapply(list(lndist, gamma, weibull, normf), function(x) x["bic"])))
fitstats$LogLikelihood_ = round(unlist(lapply(list(lndist, gamma, weibull, normf), function(x) x["loglik"])))
kable(fitstats, caption="Table of Fits for candidate distributions", align=c("l", "l", "c", "r"))
get_probs = function(vals, biggerthan, smallerthan){
numer = sum(vals > biggerthan & vals < smallerthan)
denom = length(vals)
est = (numer/denom)*100
return(round(est, 2))
}
prob = get_probs(dat1$`Day Ahead Price (HB_North)`, 55, 60)
#prob_55_60 = sum(dat1$`Day Ahead Price (HB_North)` > 55 & dat1$`Day Ahead Price (HB_North)` < 60)/length(dat1$`Day Ahead Price (HB_North)`)
date_frequency <- "1 hour"  # Time step frequency.
# Workbook 2 - this is a more dense time series at the five minute interval
dat2 = read_excel("~/Downloads/Workbook 2_Skills Test.xlsx")
# Thicken this dataset so that it contains the correct outcome - max(price) hourly
dat2_test = dat2 %>%
#filter(`Date/Time` > as.POSIXct("2016-05-15") & `Date/Time` < as.POSIXct("2016-08-16")) %>%
rename(date = `Date/Time`) %>%
rename(rtprice = `Real Time Price (HB_North)` ) %>%
thicken(colname="datehour", interval="hour", by="date", rounding = "down") %>%
group_by(datehour) %>%
summarise(rtprice = max(rtprice)) %>%
fill_gaps(date_col=1, frequency = date_frequency) # fill gaps with NA
dat1_test = dat1 %>%
rename(datehour = date)
# do the join on date-hour, and be aware that dat2 starts two weeks earlier
dat = dat2_test %>% left_join(dat1_test)
# modify names to avoid issues
names(dat) = gsub("[.]| |\\(|\\)", "_", names(dat))
# FEATURE ENGINEERING
dat_features <- forecastML::create_lagged_df(dat %>%
dplyr::select(-datehour),
outcome_col = 1,
type = "train",
horizons = 1,
lookback = c(1,2,3,4),
frequency = date_frequency,
keep_rows = T)$horizon_1
dat_features = dat_features %>%
mutate(datehour = dat$datehour) %>%
mutate(target = ifelse(rtprice > 150, "Outlier", "Inlier"))
dat_train = dat_features %>%
filter(datehour < as.POSIXct("2020-01-01")) %>%
dplyr::select(-datehour) %>%
drop_na()%>%
dplyr::select(-rtprice)
dat_test = dat_features %>%
filter(datehour < as.POSIXct("2020-01-01")) %>%
dplyr::select(-datehour)%>%
drop_na()%>%
dplyr::select(-rtprice)
#trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
cv <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = prSummary,
seeds = set.seed(123))
turn_grid_xgb <- expand.grid(
eta = 0.1,
max_depth = 5,
min_child_weight = 1,
subsample = 0.8,
colsample_bytree = 0.8,
nrounds = c(1,5)*200,
gamma = 0)
xgb <- train(target~., data=dat_train,
method = "xgbTree",
#preProcess = c("center", "scale"),
tuneGrid = turn_grid_xgb,
trControl = cv, verbosity=0)
xgb
preds_in_sample =  predict(xgb, newdata=dat_train)
preds_out_of_sample =  predict(xgb, newdata=dat_test)
out = data.frame(F1 = c(F1_Score(dat_train$target, preds_in_sample, positive = NULL),
F1_Score(dat_test$target, preds_out_of_sample, positive = NULL)),
Scores = c("   In-Sample  ", "   Out-Sample   "))
kable(out, title="Final Model Scores")
predict(xgb, dat_features %>% drop_na())
datplot = dat_features %>%
drop_na() %>%
mutate(xgb_prediction = predict(xgb, dat_features %>% drop_na()) )
datplot %>%
qplot(aes(x=datehour, y=rtprice, col=xgb_prediction))
qplot(aes(x=datehour, y=rtprice, col=xgb_prediction), data=datplot)
head(datplot)
qplot(data = datplot, aes(x=datehour, y=rtprice, col=xgb_prediction))
qplot(data = datplot, aes(x="datehour", y="rtprice", col="xgb_prediction"))
qplot(datplot$datehour, datplot$rtprice, col=datplot$xgb_prediction)
qplot(datplot$datehour, datplot$rtprice, col=datplot$xgb_prediction) +
xlab("Time") +
ylab("Real-Time Price") +
scale_y_log10()
qplot(datplot$datehour, datplot$rtprice, col=datplot$xgb_prediction) +
xlab("Time") +
ylab("Real-Time Price") +
scale_y_log10() +
geom_hline(yintercept = 150, col="black") +
geom_vline(xintercept = as.POSIXct("2020-01-01")
)
qplot(datplot$datehour, datplot$rtprice, col=datplot$xgb_prediction) +
xlab("Time") +
ylab("Real-Time Price") +
#scale_y_log10() +
geom_hline(yintercept = 150, col="black") +
geom_vline(xintercept = as.POSIXct("2020-01-01", col="grey"))
as.POSIXct("2020-01-01") + 20
qplot(datplot$datehour, datplot$rtprice, col=datplot$xgb_prediction) +
xlab("Time") +
ylab("Real-Time Price") +
#scale_y_log10() +
geom_hline(yintercept = 150, col="black") +
geom_vline(xintercept = as.POSIXct("2020-01-01"), col="grey")+
annotate("text", x=as.POSIXct("2021-01-01"), y=5000, label="TRAIN") +
annotate("text", x=as.POSIXct("2018-01-01"), y=5000, label="TEST")
View(data_train)
160/4
160/40
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(tidyr)
91+17+30+75
213/60
.55*60
library(tidyr)
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(tidyr)
library(readxl)
library(fitdistrplus)
library(stats4)
library(MASS)
# for other necessary test or graphical tools
library(survival)
library(distrMod)
library(lubridate)
library(kableExtra)
library(ggplot2)
library(caret)
library(xgboost)
library(verification)
library(timetk)
library(dplyr)
library(forecastML)
library(padr)
library(reshape2)
library(MLmetrics)
dat1 = read_excel("~/Downloads/Workbook 1_Skills Test.xlsx")
ggplot(dat1, aes(x = `Day Ahead Price (HB_North)`)) +
geom_histogram(aes(y = ..density..),
colour = 1, fill = "white", bins=100) +
geom_density(col="red") + scale_x_log10() +ggtitle("Day Ahead Price HB North")
View(dat1)
date_frequency <- "1 hour"  # Time step frequency.
# Workbook 2 - this is a more dense time series at the five minute interval
dat2 = read_excel("~/Downloads/Workbook 2_Skills Test.xlsx")
View(dat2)
library(dplyr)
library(ggplot2)
library(lme4)
library(readr)
library(stargazer)
library(effects)
library(packrat)
setwd("~/github/facing_voters/src/R/packrat")
packrat::restor()
packrat::restore()
?packrat::snapshot
packrat::snapshot(snapshot.sources = F)
getwd()
packrat::snapshot(snapshot.sources = F)
packrat::init()
warnings()
?init
